{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../../FinNLP\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CNBC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.cnbc_streaming import CNBC_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading ... 0 1 2 "
     ]
    }
   ],
   "source": [
    "news_downloader = CNBC_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30, 30)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['description', 'cn:lastPubDate', 'dateModified', 'cn:dateline',\n",
       "       'cn:branding', 'section', 'cn:type', 'author', 'cn:source',\n",
       "       'cn:subtype', 'duration', 'summary', 'expires', 'cn:sectionSubType',\n",
       "       'cn:contentClassification', 'pubdateunix', '_id', 'url', '@id',\n",
       "       'datePublished', 'cn:promoImage', 'cn:title', 'cn:keyword',\n",
       "       'cn:liveURL', '_pubDate', '_type', '_index', 'brand', 'hint',\n",
       "       'hint_detail'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>description</th>\n",
       "      <th>cn:lastPubDate</th>\n",
       "      <th>dateModified</th>\n",
       "      <th>cn:dateline</th>\n",
       "      <th>cn:branding</th>\n",
       "      <th>section</th>\n",
       "      <th>cn:type</th>\n",
       "      <th>author</th>\n",
       "      <th>cn:source</th>\n",
       "      <th>cn:subtype</th>\n",
       "      <th>...</th>\n",
       "      <th>cn:promoImage</th>\n",
       "      <th>cn:title</th>\n",
       "      <th>cn:keyword</th>\n",
       "      <th>cn:liveURL</th>\n",
       "      <th>_pubDate</th>\n",
       "      <th>_type</th>\n",
       "      <th>_index</th>\n",
       "      <th>brand</th>\n",
       "      <th>hint</th>\n",
       "      <th>hint_detail</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>While Leah Ellis was earning her doctorate at ...</td>\n",
       "      <td>2023-06-24T10:00:01+0000</td>\n",
       "      <td>2023-06-24T10:00:01+0000</td>\n",
       "      <td></td>\n",
       "      <td>cnbc</td>\n",
       "      <td>Clean Tech</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "      <td>Catherine Clifford</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>...</td>\n",
       "      <td>https://image.cnbcfm.com/api/v1/image/10726095...</td>\n",
       "      <td>Meet the 33-year-old Canadian chemist and the ...</td>\n",
       "      <td></td>\n",
       "      <td>https://www.cnbc.com/2023/06/24/sublime-system...</td>\n",
       "      <td>6/24/2023 10:00:01 PM</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>cnbc</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Amazon.com said on Friday it will take its inv...</td>\n",
       "      <td>2023-06-24T04:50:41+0000</td>\n",
       "      <td>2023-06-24T04:50:41+0000</td>\n",
       "      <td></td>\n",
       "      <td>cnbc</td>\n",
       "      <td>Technology</td>\n",
       "      <td>wirestory</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>...</td>\n",
       "      <td>https://image.cnbcfm.com/api/v1/image/10726178...</td>\n",
       "      <td>Amazon raises investment in India to $26 billi...</td>\n",
       "      <td></td>\n",
       "      <td>https://www.cnbc.com/2023/06/24/amazon-commits...</td>\n",
       "      <td>6/24/2023 1:49:10 PM</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>cnbc</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         description  \\\n",
       "0  While Leah Ellis was earning her doctorate at ...   \n",
       "1  Amazon.com said on Friday it will take its inv...   \n",
       "\n",
       "             cn:lastPubDate              dateModified cn:dateline cn:branding  \\\n",
       "0  2023-06-24T10:00:01+0000  2023-06-24T10:00:01+0000                    cnbc   \n",
       "1  2023-06-24T04:50:41+0000  2023-06-24T04:50:41+0000                    cnbc   \n",
       "\n",
       "      section        cn:type              author cn:source cn:subtype  ...  \\\n",
       "0  Clean Tech  cnbcnewsstory  Catherine Clifford        []             ...   \n",
       "1  Technology      wirestory                            []             ...   \n",
       "\n",
       "                                       cn:promoImage  \\\n",
       "0  https://image.cnbcfm.com/api/v1/image/10726095...   \n",
       "1  https://image.cnbcfm.com/api/v1/image/10726178...   \n",
       "\n",
       "                                            cn:title cn:keyword  \\\n",
       "0  Meet the 33-year-old Canadian chemist and the ...              \n",
       "1  Amazon raises investment in India to $26 billi...              \n",
       "\n",
       "                                          cn:liveURL               _pubDate  \\\n",
       "0  https://www.cnbc.com/2023/06/24/sublime-system...  6/24/2023 10:00:01 PM   \n",
       "1  https://www.cnbc.com/2023/06/24/amazon-commits...   6/24/2023 1:49:10 PM   \n",
       "\n",
       "   _type  _index brand hint hint_detail  \n",
       "0      0       0  cnbc  NaN         NaN  \n",
       "1      0       1  cnbc  NaN         NaN  \n",
       "\n",
       "[2 rows x 30 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>datePublished</th>\n",
       "      <th>cn:lastPubDate</th>\n",
       "      <th>dateModified</th>\n",
       "      <th>description</th>\n",
       "      <th>section</th>\n",
       "      <th>author</th>\n",
       "      <th>summary</th>\n",
       "      <th>cn:title</th>\n",
       "      <th>cn:type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-06-24T14:00:01+0000</td>\n",
       "      <td>2023-06-24T10:00:01+0000</td>\n",
       "      <td>2023-06-24T10:00:01+0000</td>\n",
       "      <td>While Leah Ellis was earning her doctorate at ...</td>\n",
       "      <td>Clean Tech</td>\n",
       "      <td>Catherine Clifford</td>\n",
       "      <td>Sublime Systems is scaling up a green cement. ...</td>\n",
       "      <td>Meet the 33-year-old Canadian chemist and the ...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-06-24T05:49:10+0000</td>\n",
       "      <td>2023-06-24T04:50:41+0000</td>\n",
       "      <td>2023-06-24T04:50:41+0000</td>\n",
       "      <td>Amazon.com said on Friday it will take its inv...</td>\n",
       "      <td>Technology</td>\n",
       "      <td></td>\n",
       "      <td>Modi and Jassy spoke about supporting Indian s...</td>\n",
       "      <td>Amazon raises investment in India to $26 billi...</td>\n",
       "      <td>wirestory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-06-23T22:12:07+0000</td>\n",
       "      <td>2023-06-23T18:29:45+0000</td>\n",
       "      <td>2023-06-23T18:29:45+0000</td>\n",
       "      <td>As Microsoft attempts to convince regulators t...</td>\n",
       "      <td>Technology</td>\n",
       "      <td>Jordan Novet</td>\n",
       "      <td>Microsoft has been eager to grow in mobile gam...</td>\n",
       "      <td>Microsoft says it looked at acquiring Zynga bu...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-06-23T21:51:15+0000</td>\n",
       "      <td>2023-06-23T17:51:15+0000</td>\n",
       "      <td>2023-06-23T17:51:15+0000</td>\n",
       "      <td>The CEOs of Apple, Alphabet, Microsoft got a h...</td>\n",
       "      <td>Technology</td>\n",
       "      <td>Seema Mody</td>\n",
       "      <td>Top tech execs met with Indian Prime Minister ...</td>\n",
       "      <td>Apple's Tim Cook calls India 'huge opportunity...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-06-23T17:32:48+0000</td>\n",
       "      <td>2023-06-23T13:36:59+0000</td>\n",
       "      <td>2023-06-23T13:36:59+0000</td>\n",
       "      <td>Tech executives like Apple CEO Tim Cook visit ...</td>\n",
       "      <td>Fast Money Halftime Report</td>\n",
       "      <td>Seema Mody</td>\n",
       "      <td>Tech executives like Apple CEO Tim Cook visit ...</td>\n",
       "      <td>Tech CEOs meet President Biden and Indian PM M...</td>\n",
       "      <td>cnbcvideo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2023-06-23T12:30:17+0000</td>\n",
       "      <td>2023-06-23T13:32:42+0000</td>\n",
       "      <td>2023-06-23T13:32:42+0000</td>\n",
       "      <td>Anyone want to buy or sell this tech rally? To...</td>\n",
       "      <td>Pro: Pro Columnists</td>\n",
       "      <td>Bob Pisani</td>\n",
       "      <td>Following the rebalancing of S&amp;P indexes last ...</td>\n",
       "      <td>Friday could offer a once-in-a-year chance to ...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2023-06-23T12:26:42+0000</td>\n",
       "      <td>2023-06-23T11:47:06+0000</td>\n",
       "      <td>2023-06-23T11:47:06+0000</td>\n",
       "      <td>Here are Friday's biggest calls on Wall Street...</td>\n",
       "      <td>Pro: Analyst Stock Picks</td>\n",
       "      <td>Michael Bloom</td>\n",
       "      <td>Here are Friday's biggest calls on Wall Street.</td>\n",
       "      <td>Here are Friday's biggest analyst calls: Meta,...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2023-06-23T06:30:01+0000</td>\n",
       "      <td>2023-06-23T02:30:01+0000</td>\n",
       "      <td>2023-06-23T02:30:01+0000</td>\n",
       "      <td>This report is from today's CNBC Daily Open, o...</td>\n",
       "      <td>Daily Open</td>\n",
       "      <td>Yeo Boon Ping</td>\n",
       "      <td>Investors have been lulled by a sense of secur...</td>\n",
       "      <td>CNBC Daily Open: Seeking shelter in tech</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2023-06-23T05:45:33+0000</td>\n",
       "      <td>2023-06-23T10:37:42+0000</td>\n",
       "      <td>2023-06-23T10:37:42+0000</td>\n",
       "      <td>AMSTERDAM — Artificial intelligence has a raci...</td>\n",
       "      <td>Technology</td>\n",
       "      <td>Ryan Browne</td>\n",
       "      <td>When it comes to banking and financial service...</td>\n",
       "      <td>A.I. has a discrimination problem. In banking,...</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2023-06-22T23:43:01+0000</td>\n",
       "      <td>2023-06-23T01:01:10+0000</td>\n",
       "      <td>2023-06-23T01:01:10+0000</td>\n",
       "      <td>This report is from today's CNBC Daily Open, o...</td>\n",
       "      <td>Daily Open</td>\n",
       "      <td>Yeo Boon Ping</td>\n",
       "      <td>Investors have been lulled by a sense of secur...</td>\n",
       "      <td>CNBC Daily Open: Rate hikes and red lights</td>\n",
       "      <td>cnbcnewsstory</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              datePublished            cn:lastPubDate  \\\n",
       "0  2023-06-24T14:00:01+0000  2023-06-24T10:00:01+0000   \n",
       "1  2023-06-24T05:49:10+0000  2023-06-24T04:50:41+0000   \n",
       "2  2023-06-23T22:12:07+0000  2023-06-23T18:29:45+0000   \n",
       "3  2023-06-23T21:51:15+0000  2023-06-23T17:51:15+0000   \n",
       "4  2023-06-23T17:32:48+0000  2023-06-23T13:36:59+0000   \n",
       "5  2023-06-23T12:30:17+0000  2023-06-23T13:32:42+0000   \n",
       "6  2023-06-23T12:26:42+0000  2023-06-23T11:47:06+0000   \n",
       "7  2023-06-23T06:30:01+0000  2023-06-23T02:30:01+0000   \n",
       "8  2023-06-23T05:45:33+0000  2023-06-23T10:37:42+0000   \n",
       "9  2023-06-22T23:43:01+0000  2023-06-23T01:01:10+0000   \n",
       "\n",
       "               dateModified  \\\n",
       "0  2023-06-24T10:00:01+0000   \n",
       "1  2023-06-24T04:50:41+0000   \n",
       "2  2023-06-23T18:29:45+0000   \n",
       "3  2023-06-23T17:51:15+0000   \n",
       "4  2023-06-23T13:36:59+0000   \n",
       "5  2023-06-23T13:32:42+0000   \n",
       "6  2023-06-23T11:47:06+0000   \n",
       "7  2023-06-23T02:30:01+0000   \n",
       "8  2023-06-23T10:37:42+0000   \n",
       "9  2023-06-23T01:01:10+0000   \n",
       "\n",
       "                                         description  \\\n",
       "0  While Leah Ellis was earning her doctorate at ...   \n",
       "1  Amazon.com said on Friday it will take its inv...   \n",
       "2  As Microsoft attempts to convince regulators t...   \n",
       "3  The CEOs of Apple, Alphabet, Microsoft got a h...   \n",
       "4  Tech executives like Apple CEO Tim Cook visit ...   \n",
       "5  Anyone want to buy or sell this tech rally? To...   \n",
       "6  Here are Friday's biggest calls on Wall Street...   \n",
       "7  This report is from today's CNBC Daily Open, o...   \n",
       "8  AMSTERDAM — Artificial intelligence has a raci...   \n",
       "9  This report is from today's CNBC Daily Open, o...   \n",
       "\n",
       "                      section              author  \\\n",
       "0                  Clean Tech  Catherine Clifford   \n",
       "1                  Technology                       \n",
       "2                  Technology        Jordan Novet   \n",
       "3                  Technology          Seema Mody   \n",
       "4  Fast Money Halftime Report          Seema Mody   \n",
       "5         Pro: Pro Columnists          Bob Pisani   \n",
       "6    Pro: Analyst Stock Picks       Michael Bloom   \n",
       "7                  Daily Open       Yeo Boon Ping   \n",
       "8                  Technology         Ryan Browne   \n",
       "9                  Daily Open       Yeo Boon Ping   \n",
       "\n",
       "                                             summary  \\\n",
       "0  Sublime Systems is scaling up a green cement. ...   \n",
       "1  Modi and Jassy spoke about supporting Indian s...   \n",
       "2  Microsoft has been eager to grow in mobile gam...   \n",
       "3  Top tech execs met with Indian Prime Minister ...   \n",
       "4  Tech executives like Apple CEO Tim Cook visit ...   \n",
       "5  Following the rebalancing of S&P indexes last ...   \n",
       "6    Here are Friday's biggest calls on Wall Street.   \n",
       "7  Investors have been lulled by a sense of secur...   \n",
       "8  When it comes to banking and financial service...   \n",
       "9  Investors have been lulled by a sense of secur...   \n",
       "\n",
       "                                            cn:title        cn:type  \n",
       "0  Meet the 33-year-old Canadian chemist and the ...  cnbcnewsstory  \n",
       "1  Amazon raises investment in India to $26 billi...      wirestory  \n",
       "2  Microsoft says it looked at acquiring Zynga bu...  cnbcnewsstory  \n",
       "3  Apple's Tim Cook calls India 'huge opportunity...  cnbcnewsstory  \n",
       "4  Tech CEOs meet President Biden and Indian PM M...      cnbcvideo  \n",
       "5  Friday could offer a once-in-a-year chance to ...  cnbcnewsstory  \n",
       "6  Here are Friday's biggest analyst calls: Meta,...  cnbcnewsstory  \n",
       "7           CNBC Daily Open: Seeking shelter in tech  cnbcnewsstory  \n",
       "8  A.I. has a discrimination problem. In banking,...  cnbcnewsstory  \n",
       "9         CNBC Daily Open: Rate hikes and red lights  cnbcnewsstory  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"datePublished\", \"cn:lastPubDate\", \"dateModified\", \"description\", \"section\" ,\"author\", \"summary\" , \"cn:title\", \"cn:type\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Yicai / 第一财经"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.yicai_streaming import Yicai_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading ... 0 1 2 "
     ]
    }
   ],
   "source": [
    "news_downloader = Yicai_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"茅台\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60, 13)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>channelid</th>\n",
       "      <th>creationDate</th>\n",
       "      <th>desc</th>\n",
       "      <th>id</th>\n",
       "      <th>previewImage</th>\n",
       "      <th>source</th>\n",
       "      <th>tags</th>\n",
       "      <th>title</th>\n",
       "      <th>topics</th>\n",
       "      <th>typeo</th>\n",
       "      <th>url</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td>100000320</td>\n",
       "      <td>06-21 11:41</td>\n",
       "      <td></td>\n",
       "      <td>101788593</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>北斗星通;游资;龙虎;买入;通信</td>\n",
       "      <td>机构抄底超讯通信  游资封板北斗星通丨龙虎榜</td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>/news/101788593.html</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[周艾琳]</td>\n",
       "      <td>53</td>\n",
       "      <td>06-20 21:55</td>\n",
       "      <td>2003年7月，第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票，受到了国内...</td>\n",
       "      <td>101788183</td>\n",
       "      <td>2023/06/e42c4bda8cc367f523764c90447ab5a3.jpg</td>\n",
       "      <td>第一财经</td>\n",
       "      <td>外资;A股;基金;QFII;RQFII;瑞银</td>\n",
       "      <td>QFII投资A股走过20年，外资驶向何方？</td>\n",
       "      <td></td>\n",
       "      <td>10</td>\n",
       "      <td>/news/101788183.html</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  author  channelid creationDate  \\\n",
       "0         100000320  06-21 11:41   \n",
       "1  [周艾琳]         53  06-20 21:55   \n",
       "\n",
       "                                                desc         id  \\\n",
       "0                                                     101788593   \n",
       "1  2003年7月，第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票，受到了国内...  101788183   \n",
       "\n",
       "                                   previewImage source  \\\n",
       "0                                                 第一财经   \n",
       "1  2023/06/e42c4bda8cc367f523764c90447ab5a3.jpg   第一财经   \n",
       "\n",
       "                     tags                   title topics  typeo  \\\n",
       "0        北斗星通;游资;龙虎;买入;通信  机构抄底超讯通信  游资封板北斗星通丨龙虎榜            10   \n",
       "1  外资;A股;基金;QFII;RQFII;瑞银   QFII投资A股走过20年，外资驶向何方？            10   \n",
       "\n",
       "                    url  weight  \n",
       "0  /news/101788593.html      50  \n",
       "1  /news/101788183.html      50  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>creationDate</th>\n",
       "      <th>desc</th>\n",
       "      <th>source</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td>06-21 11:41</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>机构抄底超讯通信  游资封板北斗星通丨龙虎榜</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[周艾琳]</td>\n",
       "      <td>06-20 21:55</td>\n",
       "      <td>2003年7月，第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票，受到了国内...</td>\n",
       "      <td>第一财经</td>\n",
       "      <td>QFII投资A股走过20年，外资驶向何方？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td>06-20 11:46</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>北向资金抄底贵州&lt;i&gt;茅台&lt;/i&gt; 游资联手封板中远海科丨龙虎榜</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td>06-20 11:45</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>22股获北向资金加仓超亿元</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td>06-20 11:36</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>北向资金抄底贵州&lt;i&gt;茅台&lt;/i&gt; 游资联手封板中远海科丨龙虎榜</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td></td>\n",
       "      <td>06-20 06:23</td>\n",
       "      <td>第一财经每日早间精选热点新闻，点击「听新闻」，一键收听。</td>\n",
       "      <td>第一财经</td>\n",
       "      <td>布林肯结束访华，外交部美大司司长介绍情况；2023高考网上咨询周时间安排公布丨早报</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>[第一财经]</td>\n",
       "      <td>06-19 19:06</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>今日股市0619丨50大跌小票指数强势 分化局面会否延续？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td></td>\n",
       "      <td>06-19 19:05</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>今日股市0619丨50大跌小票指数强势 分化局面会否延续？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>[一财资讯]</td>\n",
       "      <td>06-19 17:46</td>\n",
       "      <td>净买入额居前三的是贵州&lt;i&gt;茅台&lt;/i&gt;、药明康德、新易盛，分别获净买入3.48亿元、3.3...</td>\n",
       "      <td>第一财经</td>\n",
       "      <td>北向资金净卖出14.47亿元，贵州&lt;i&gt;茅台&lt;/i&gt;、药明康德等获加仓</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td></td>\n",
       "      <td>06-19 15:39</td>\n",
       "      <td></td>\n",
       "      <td>第一财经</td>\n",
       "      <td>三大指数小幅收跌 TMT赛道持续大涨｜尾市盘点</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   author creationDate                                               desc  \\\n",
       "0          06-21 11:41                                                      \n",
       "1   [周艾琳]  06-20 21:55  2003年7月，第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票，受到了国内...   \n",
       "2          06-20 11:46                                                      \n",
       "3          06-20 11:45                                                      \n",
       "4          06-20 11:36                                                      \n",
       "5          06-20 06:23                       第一财经每日早间精选热点新闻，点击「听新闻」，一键收听。   \n",
       "6  [第一财经]  06-19 19:06                                                      \n",
       "7          06-19 19:05                                                      \n",
       "8  [一财资讯]  06-19 17:46  净买入额居前三的是贵州<i>茅台</i>、药明康德、新易盛，分别获净买入3.48亿元、3.3...   \n",
       "9          06-19 15:39                                                      \n",
       "\n",
       "  source                                      title  \n",
       "0   第一财经                     机构抄底超讯通信  游资封板北斗星通丨龙虎榜  \n",
       "1   第一财经                      QFII投资A股走过20年，外资驶向何方？  \n",
       "2   第一财经           北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜  \n",
       "3   第一财经                              22股获北向资金加仓超亿元  \n",
       "4   第一财经           北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜  \n",
       "5   第一财经  布林肯结束访华，外交部美大司司长介绍情况；2023高考网上咨询周时间安排公布丨早报  \n",
       "6                     今日股市0619丨50大跌小票指数强势 分化局面会否延续？  \n",
       "7   第一财经              今日股市0619丨50大跌小票指数强势 分化局面会否延续？  \n",
       "8   第一财经       北向资金净卖出14.47亿元，贵州<i>茅台</i>、药明康德等获加仓   \n",
       "9   第一财经                    三大指数小幅收跌 TMT赛道持续大涨｜尾市盘点  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"author\", \"creationDate\", \"desc\" ,\"source\", \"title\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### InvestorPlace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading ... 0 1 2 "
     ]
    }
   ],
   "source": [
    "news_downloader = InvestorPlace_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>time</th>\n",
       "      <th>author</th>\n",
       "      <th>summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[Trillion-Dollar Tech: 3 Stocks Poised for Unp...</td>\n",
       "      <td>Jun 19, 2023</td>\n",
       "      <td>Faisal Humayun, InvestorPlace Contributor</td>\n",
       "      <td>These are the tech stocks to buy for multibagg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[Trillion-Dollar Tech: 3 Stocks Poised for Unp...</td>\n",
       "      <td>Jun 22, 2023</td>\n",
       "      <td>Chris MacDonald, InvestorPlace Contributor</td>\n",
       "      <td>Warren Buffett is undoubtedly one of the great...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[Invest Like a Billionaire: 3 Long-Term Stocks...</td>\n",
       "      <td>Jun 18, 2023</td>\n",
       "      <td>Joel Baglole, InvestorPlace Contributor</td>\n",
       "      <td>With markets now recovering from the downturn ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[Invest Like a Billionaire: 3 Long-Term Stocks...</td>\n",
       "      <td>Jun 16, 2023</td>\n",
       "      <td>Louis Navellier and the InvestorPlace Research...</td>\n",
       "      <td>The best tech stocks to watch are involved in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[3 Tech Titans Leading the Charge Toward $10 T...</td>\n",
       "      <td>Jun 19, 2023</td>\n",
       "      <td>Will Ashworth, InvestorPlace Contributor</td>\n",
       "      <td>Avoiding bad stocks requires investors to get ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>[3 Tech Titans Leading the Charge Toward $10 T...</td>\n",
       "      <td>Jun 19, 2023</td>\n",
       "      <td>Tyrik Torres, InvestorPlace Contributor</td>\n",
       "      <td>While AI software companies tend to get more b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>[7 Tech Stocks to Watch Out For in 2023 … and ...</td>\n",
       "      <td>Jun 16, 2023</td>\n",
       "      <td>Chris MacDonald, InvestorPlace Contributor</td>\n",
       "      <td>Many long-term conservative investors pay atte...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>[7 Tech Stocks to Watch Out For in 2023 … and ...</td>\n",
       "      <td>Jun 16, 2023</td>\n",
       "      <td>Louis Navellier and the InvestorPlace Research...</td>\n",
       "      <td>Every stock has its ups and downs, but reliabl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>[3 Smart Takes on 3 Dumb Stocks]</td>\n",
       "      <td>Jun 23, 2023</td>\n",
       "      <td>Samuel O'Brient, InvestorPlace Financial News ...</td>\n",
       "      <td>Even as tech stocks rally, short sellers are s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>[3 Smart Takes on 3 Dumb Stocks]</td>\n",
       "      <td>Jun 18, 2023</td>\n",
       "      <td>Chris Markoch, InvestorPlace Contributor</td>\n",
       "      <td>Here are seven high cash flow stocks that prov...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title          time  \\\n",
       "0  [Trillion-Dollar Tech: 3 Stocks Poised for Unp...  Jun 19, 2023   \n",
       "1  [Trillion-Dollar Tech: 3 Stocks Poised for Unp...  Jun 22, 2023   \n",
       "2  [Invest Like a Billionaire: 3 Long-Term Stocks...  Jun 18, 2023   \n",
       "3  [Invest Like a Billionaire: 3 Long-Term Stocks...  Jun 16, 2023   \n",
       "4  [3 Tech Titans Leading the Charge Toward $10 T...  Jun 19, 2023   \n",
       "5  [3 Tech Titans Leading the Charge Toward $10 T...  Jun 19, 2023   \n",
       "6  [7 Tech Stocks to Watch Out For in 2023 … and ...  Jun 16, 2023   \n",
       "7  [7 Tech Stocks to Watch Out For in 2023 … and ...  Jun 16, 2023   \n",
       "8                   [3 Smart Takes on 3 Dumb Stocks]  Jun 23, 2023   \n",
       "9                   [3 Smart Takes on 3 Dumb Stocks]  Jun 18, 2023   \n",
       "\n",
       "                                              author  \\\n",
       "0          Faisal Humayun, InvestorPlace Contributor   \n",
       "1         Chris MacDonald, InvestorPlace Contributor   \n",
       "2            Joel Baglole, InvestorPlace Contributor   \n",
       "3  Louis Navellier and the InvestorPlace Research...   \n",
       "4           Will Ashworth, InvestorPlace Contributor   \n",
       "5            Tyrik Torres, InvestorPlace Contributor   \n",
       "6         Chris MacDonald, InvestorPlace Contributor   \n",
       "7  Louis Navellier and the InvestorPlace Research...   \n",
       "8  Samuel O'Brient, InvestorPlace Financial News ...   \n",
       "9           Chris Markoch, InvestorPlace Contributor   \n",
       "\n",
       "                                             summary  \n",
       "0  These are the tech stocks to buy for multibagg...  \n",
       "1  Warren Buffett is undoubtedly one of the great...  \n",
       "2  With markets now recovering from the downturn ...  \n",
       "3  The best tech stocks to watch are involved in ...  \n",
       "4  Avoiding bad stocks requires investors to get ...  \n",
       "5  While AI software companies tend to get more b...  \n",
       "6  Many long-term conservative investors pay atte...  \n",
       "7  Every stock has its ups and downs, but reliabl...  \n",
       "8  Even as tech stocks rally, short sellers are s...  \n",
       "9  Here are seven high cash flow stocks that prov...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"time\" ,\"author\", \"summary\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### GuruFocus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.gurufocus_streaming import GuruFocus_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only support first page now!\n"
     ]
    }
   ],
   "source": [
    "news_downloader = GuruFocus_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>view</th>\n",
       "      <th>source</th>\n",
       "      <th>datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3 Magic Formula Stocks Popular With Gurus</td>\n",
       "      <td>0 Views</td>\n",
       "      <td>Margaret Moran</td>\n",
       "      <td>2023-06-23 17:38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jeremy Grantham: The Super Bubble Is About to Pop</td>\n",
       "      <td>60 Views</td>\n",
       "      <td>Ben Alaimo</td>\n",
       "      <td>2023-06-23 09:21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5 High GF Score Stocks That Outperformed the M...</td>\n",
       "      <td>106 Views</td>\n",
       "      <td>James Li</td>\n",
       "      <td>2023-06-21 19:36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>New Feature: DuPont Analysis Chart for Enhance...</td>\n",
       "      <td>259 Views</td>\n",
       "      <td>Vera Yuan</td>\n",
       "      <td>2023-06-21 16:55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The Most-Sold Guru Stocks of the 1st Quarter</td>\n",
       "      <td>261 Views</td>\n",
       "      <td>Margaret Moran</td>\n",
       "      <td>2023-06-16 17:32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>AI Revolution and Debt Ceiling Resolution</td>\n",
       "      <td>198 Views</td>\n",
       "      <td>Wade W. Slome, CFA, CFP</td>\n",
       "      <td>2023-06-05 21:03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Nvidia vs. ARK Invest: Which Is the Better Gro...</td>\n",
       "      <td>332 Views</td>\n",
       "      <td>Joey Frenette</td>\n",
       "      <td>2023-05-27 02:05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Top 5 1st Quarter Trades of CYPRESS ASSET MANA...</td>\n",
       "      <td>0 Views</td>\n",
       "      <td>GuruFocus Editor</td>\n",
       "      <td>2023-05-26 14:08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Mill Creek Capital Advisors, LLC Buys 2, Sells...</td>\n",
       "      <td>0 Views</td>\n",
       "      <td>GuruFocus Editor</td>\n",
       "      <td>2023-05-25 18:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Jim Simons' Renaissance Technologies Chops Pos...</td>\n",
       "      <td>380 Views</td>\n",
       "      <td>James Li</td>\n",
       "      <td>2023-05-24 18:43</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title       view  \\\n",
       "0          3 Magic Formula Stocks Popular With Gurus    0 Views   \n",
       "1  Jeremy Grantham: The Super Bubble Is About to Pop   60 Views   \n",
       "2  5 High GF Score Stocks That Outperformed the M...  106 Views   \n",
       "3  New Feature: DuPont Analysis Chart for Enhance...  259 Views   \n",
       "4       The Most-Sold Guru Stocks of the 1st Quarter  261 Views   \n",
       "5          AI Revolution and Debt Ceiling Resolution  198 Views   \n",
       "6  Nvidia vs. ARK Invest: Which Is the Better Gro...  332 Views   \n",
       "7  Top 5 1st Quarter Trades of CYPRESS ASSET MANA...    0 Views   \n",
       "8  Mill Creek Capital Advisors, LLC Buys 2, Sells...    0 Views   \n",
       "9  Jim Simons' Renaissance Technologies Chops Pos...  380 Views   \n",
       "\n",
       "                    source          datetime  \n",
       "0           Margaret Moran  2023-06-23 17:38  \n",
       "1               Ben Alaimo  2023-06-23 09:21  \n",
       "2                 James Li  2023-06-21 19:36  \n",
       "3                Vera Yuan  2023-06-21 16:55  \n",
       "4           Margaret Moran  2023-06-16 17:32  \n",
       "5  Wade W. Slome, CFA, CFP  2023-06-05 21:03  \n",
       "6            Joey Frenette  2023-05-27 02:05  \n",
       "7         GuruFocus Editor  2023-05-26 14:08  \n",
       "8         GuruFocus Editor  2023-05-25 18:10  \n",
       "9                 James Li  2023-05-24 18:43  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"view\" ,\"source\", \"datetime\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Alliance News"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.alliancenews_streaming import AllianceNews_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "news_downloader = AllianceNews_Streaming()\n",
    "news_downloader.download_streaming_search(rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(36, 16)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>urlId</th>\n",
       "      <th>title</th>\n",
       "      <th>summary</th>\n",
       "      <th>created</th>\n",
       "      <th>updated</th>\n",
       "      <th>thumbnailUrl</th>\n",
       "      <th>source</th>\n",
       "      <th>taxonomies</th>\n",
       "      <th>type</th>\n",
       "      <th>author</th>\n",
       "      <th>meta</th>\n",
       "      <th>sponsor</th>\n",
       "      <th>parent</th>\n",
       "      <th>contentId</th>\n",
       "      <th>displayTaxonomies</th>\n",
       "      <th>parentTaxonomy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/news/new-york-market-close-stocks-down-dollar...</td>\n",
       "      <td>NEW YORK MARKET CLOSE: Stocks down, dollar up ...</td>\n",
       "      <td>None</td>\n",
       "      <td>2023-06-23T21:18:34</td>\n",
       "      <td>2023-06-23T21:18:34</td>\n",
       "      <td>None</td>\n",
       "      <td>{'code': 'ALLIANCE', 'title': 'Alliance News',...</td>\n",
       "      <td>[{'termId': 'CTMRR', 'parentTermId': None, 'ti...</td>\n",
       "      <td>news</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'NEW YORK MARKET CLOSE: Stocks down,...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>al1687551514259519100</td>\n",
       "      <td>[{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ...</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               urlId  \\\n",
       "0  /news/new-york-market-close-stocks-down-dollar...   \n",
       "\n",
       "                                               title summary  \\\n",
       "0  NEW YORK MARKET CLOSE: Stocks down, dollar up ...    None   \n",
       "\n",
       "               created              updated thumbnailUrl  \\\n",
       "0  2023-06-23T21:18:34  2023-06-23T21:18:34         None   \n",
       "\n",
       "                                              source  \\\n",
       "0  {'code': 'ALLIANCE', 'title': 'Alliance News',...   \n",
       "\n",
       "                                          taxonomies  type author  \\\n",
       "0  [{'termId': 'CTMRR', 'parentTermId': None, 'ti...  news   None   \n",
       "\n",
       "                                                meta sponsor parent  \\\n",
       "0  {'title': 'NEW YORK MARKET CLOSE: Stocks down,...    None   None   \n",
       "\n",
       "               contentId                                  displayTaxonomies  \\\n",
       "0  al1687551514259519100  [{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ...   \n",
       "\n",
       "  parentTaxonomy  \n",
       "0           None  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>created</th>\n",
       "      <th>updated</th>\n",
       "      <th>title</th>\n",
       "      <th>summary</th>\n",
       "      <th>meta</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-06-23T21:18:34</td>\n",
       "      <td>2023-06-23T21:18:34</td>\n",
       "      <td>NEW YORK MARKET CLOSE: Stocks down, dollar up ...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'NEW YORK MARKET CLOSE: Stocks down,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-06-23T19:34:05</td>\n",
       "      <td>2023-06-23T19:34:05</td>\n",
       "      <td>IN BRIEF: Blackstone Loan Financing proposes w...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: Blackstone Loan Financing...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-06-23T18:34:41</td>\n",
       "      <td>2023-06-23T18:34:41</td>\n",
       "      <td>IN BRIEF: Bonhill expects to complete sale of ...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: Bonhill expects to comple...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-06-23T18:01:27</td>\n",
       "      <td>2023-06-23T18:01:27</td>\n",
       "      <td>UPDATE: SRT Marine Systems raises GBP4.6 milli...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'UPDATE: SRT Marine Systems raises G...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-06-23T18:00:27</td>\n",
       "      <td>2023-06-23T18:00:27</td>\n",
       "      <td>IN BRIEF: New Energy One Acquisition confirms ...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: New Energy One Acquisitio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2023-06-23T17:41:15</td>\n",
       "      <td>2023-06-23T17:41:15</td>\n",
       "      <td>IN BRIEF: Kropz makes draw down request on bri...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: Kropz makes draw down req...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2023-06-23T17:31:17</td>\n",
       "      <td>2023-06-23T17:31:17</td>\n",
       "      <td>IN BRIEF: XPS Pensions discusses National Pens...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: XPS Pensions discusses Na...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2023-06-23T17:25:54</td>\n",
       "      <td>2023-06-23T17:25:54</td>\n",
       "      <td>DIRECTOR DEALINGS: GSK CFO buys shares worth G...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'DIRECTOR DEALINGS: GSK CFO buys sha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2023-06-23T17:21:29</td>\n",
       "      <td>2023-06-23T17:21:29</td>\n",
       "      <td>IN BRIEF: Gilead Sciences says test results sh...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN BRIEF: Gilead Sciences says test...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2023-06-23T17:07:24</td>\n",
       "      <td>2023-06-23T17:07:24</td>\n",
       "      <td>IN THE KNOW: AB Foods \"fundamentally strong\" w...</td>\n",
       "      <td>None</td>\n",
       "      <td>{'title': 'IN THE KNOW: AB Foods \"fundamentall...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               created              updated  \\\n",
       "0  2023-06-23T21:18:34  2023-06-23T21:18:34   \n",
       "1  2023-06-23T19:34:05  2023-06-23T19:34:05   \n",
       "2  2023-06-23T18:34:41  2023-06-23T18:34:41   \n",
       "3  2023-06-23T18:01:27  2023-06-23T18:01:27   \n",
       "4  2023-06-23T18:00:27  2023-06-23T18:00:27   \n",
       "5  2023-06-23T17:41:15  2023-06-23T17:41:15   \n",
       "6  2023-06-23T17:31:17  2023-06-23T17:31:17   \n",
       "7  2023-06-23T17:25:54  2023-06-23T17:25:54   \n",
       "8  2023-06-23T17:21:29  2023-06-23T17:21:29   \n",
       "9  2023-06-23T17:07:24  2023-06-23T17:07:24   \n",
       "\n",
       "                                               title summary  \\\n",
       "0  NEW YORK MARKET CLOSE: Stocks down, dollar up ...    None   \n",
       "1  IN BRIEF: Blackstone Loan Financing proposes w...    None   \n",
       "2  IN BRIEF: Bonhill expects to complete sale of ...    None   \n",
       "3  UPDATE: SRT Marine Systems raises GBP4.6 milli...    None   \n",
       "4  IN BRIEF: New Energy One Acquisition confirms ...    None   \n",
       "5  IN BRIEF: Kropz makes draw down request on bri...    None   \n",
       "6  IN BRIEF: XPS Pensions discusses National Pens...    None   \n",
       "7  DIRECTOR DEALINGS: GSK CFO buys shares worth G...    None   \n",
       "8  IN BRIEF: Gilead Sciences says test results sh...    None   \n",
       "9  IN THE KNOW: AB Foods \"fundamentally strong\" w...    None   \n",
       "\n",
       "                                                meta  \n",
       "0  {'title': 'NEW YORK MARKET CLOSE: Stocks down,...  \n",
       "1  {'title': 'IN BRIEF: Blackstone Loan Financing...  \n",
       "2  {'title': 'IN BRIEF: Bonhill expects to comple...  \n",
       "3  {'title': 'UPDATE: SRT Marine Systems raises G...  \n",
       "4  {'title': 'IN BRIEF: New Energy One Acquisitio...  \n",
       "5  {'title': 'IN BRIEF: Kropz makes draw down req...  \n",
       "6  {'title': 'IN BRIEF: XPS Pensions discusses Na...  \n",
       "7  {'title': 'DIRECTOR DEALINGS: GSK CFO buys sha...  \n",
       "8  {'title': 'IN BRIEF: Gilead Sciences says test...  \n",
       "9  {'title': 'IN THE KNOW: AB Foods \"fundamentall...  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"created\", \"updated\", \"title\", \"summary\", \"meta\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TalkMarkets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.talkmarkets_streaming import TalkMarkets_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading... 0 1 2 "
     ]
    }
   ],
   "source": [
    "news_downloader = TalkMarkets_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60, 12)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cacheUrl</th>\n",
       "      <th>clicktrackUrl</th>\n",
       "      <th>content</th>\n",
       "      <th>contentNoFormatting</th>\n",
       "      <th>title</th>\n",
       "      <th>titleNoFormatting</th>\n",
       "      <th>formattedUrl</th>\n",
       "      <th>unescapedUrl</th>\n",
       "      <th>url</th>\n",
       "      <th>visibleUrl</th>\n",
       "      <th>richSnippet</th>\n",
       "      <th>breadcrumbUrl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>http://www.google.com/search?q=cache:PUjyIRJA8...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "      <td>23 hours ago &lt;b&gt;...&lt;/b&gt; &lt;b&gt;Apple&lt;/b&gt;, 187, 3.0...</td>\n",
       "      <td>23 hours ago ... Apple, 187, 3.04, 1.65%, 187....</td>\n",
       "      <td>Equitymaster India | Sensex Today Trades Lower...</td>\n",
       "      <td>Equitymaster India | Sensex Today Trades Lower...</td>\n",
       "      <td>https://talkmarkets.com/.../sensex-today-trade...</td>\n",
       "      <td>https://talkmarkets.com/content/global-markets...</td>\n",
       "      <td>https://talkmarkets.com/content/global-markets...</td>\n",
       "      <td>talkmarkets.com</td>\n",
       "      <td>{'cseImage': {'src': 'https://www.eqimg.com/im...</td>\n",
       "      <td>{'host': 'talkmarkets.com', 'crumbs': ['sensex...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            cacheUrl  \\\n",
       "0  http://www.google.com/search?q=cache:PUjyIRJA8...   \n",
       "\n",
       "                                       clicktrackUrl  \\\n",
       "0  https://www.google.com/url?client=internal-ele...   \n",
       "\n",
       "                                             content  \\\n",
       "0  23 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...   \n",
       "\n",
       "                                 contentNoFormatting  \\\n",
       "0  23 hours ago ... Apple, 187, 3.04, 1.65%, 187....   \n",
       "\n",
       "                                               title  \\\n",
       "0  Equitymaster India | Sensex Today Trades Lower...   \n",
       "\n",
       "                                   titleNoFormatting  \\\n",
       "0  Equitymaster India | Sensex Today Trades Lower...   \n",
       "\n",
       "                                        formattedUrl  \\\n",
       "0  https://talkmarkets.com/.../sensex-today-trade...   \n",
       "\n",
       "                                        unescapedUrl  \\\n",
       "0  https://talkmarkets.com/content/global-markets...   \n",
       "\n",
       "                                                 url       visibleUrl  \\\n",
       "0  https://talkmarkets.com/content/global-markets...  talkmarkets.com   \n",
       "\n",
       "                                         richSnippet  \\\n",
       "0  {'cseImage': {'src': 'https://www.eqimg.com/im...   \n",
       "\n",
       "                                       breadcrumbUrl  \n",
       "0  {'host': 'talkmarkets.com', 'crumbs': ['sensex...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>url</th>\n",
       "      <th>clicktrackUrl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>23 hours ago &lt;b&gt;...&lt;/b&gt; &lt;b&gt;Apple&lt;/b&gt;, 187, 3.0...</td>\n",
       "      <td>https://talkmarkets.com/content/global-markets...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1 day ago &lt;b&gt;...&lt;/b&gt; Get Adobe Inc. (ADBE:NASD...</td>\n",
       "      <td>https://talkmarkets.com/symbol/adbe/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1 day ago &lt;b&gt;...&lt;/b&gt; Get Starbucks Corp (SBUX:...</td>\n",
       "      <td>https://talkmarkets.com/symbol/sbux/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10 hours ago &lt;b&gt;...&lt;/b&gt; Wednesday&amp;#39;s top an...</td>\n",
       "      <td>https://talkmarkets.com/symbol/pypl/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20 hours ago &lt;b&gt;...&lt;/b&gt; &lt;b&gt;Apple&lt;/b&gt; (AAPL). &lt;...</td>\n",
       "      <td>https://talkmarkets.com/content/stocks--equiti...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2 days ago &lt;b&gt;...&lt;/b&gt; Friday&amp;#39;s top analyst...</td>\n",
       "      <td>https://talkmarkets.com/symbol/sofi/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2 days ago &lt;b&gt;...&lt;/b&gt; Get Enphase Energy Inc (...</td>\n",
       "      <td>https://talkmarkets.com/symbol/enph/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3 days ago &lt;b&gt;...&lt;/b&gt; &lt;b&gt;Apple&lt;/b&gt; Inc. design...</td>\n",
       "      <td>https://talkmarkets.com/contributor/jimvanmeer...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>3 days ago &lt;b&gt;...&lt;/b&gt; &lt;b&gt;Apple&lt;/b&gt; Inc. design...</td>\n",
       "      <td>https://talkmarkets.com/content/stocks--equiti...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>5 Jan 2023 &lt;b&gt;...&lt;/b&gt; Get Amazon.com Inc (AMZN...</td>\n",
       "      <td>https://talkmarkets.com/symbol/amzn/portal-wid...</td>\n",
       "      <td>https://www.google.com/url?client=internal-ele...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             content  \\\n",
       "0  23 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...   \n",
       "1  1 day ago <b>...</b> Get Adobe Inc. (ADBE:NASD...   \n",
       "2  1 day ago <b>...</b> Get Starbucks Corp (SBUX:...   \n",
       "3  10 hours ago <b>...</b> Wednesday&#39;s top an...   \n",
       "4  20 hours ago <b>...</b> <b>Apple</b> (AAPL). <...   \n",
       "5  2 days ago <b>...</b> Friday&#39;s top analyst...   \n",
       "6  2 days ago <b>...</b> Get Enphase Energy Inc (...   \n",
       "7  3 days ago <b>...</b> <b>Apple</b> Inc. design...   \n",
       "8  3 days ago <b>...</b> <b>Apple</b> Inc. design...   \n",
       "9  5 Jan 2023 <b>...</b> Get Amazon.com Inc (AMZN...   \n",
       "\n",
       "                                                 url  \\\n",
       "0  https://talkmarkets.com/content/global-markets...   \n",
       "1  https://talkmarkets.com/symbol/adbe/portal-wid...   \n",
       "2  https://talkmarkets.com/symbol/sbux/portal-wid...   \n",
       "3  https://talkmarkets.com/symbol/pypl/portal-wid...   \n",
       "4  https://talkmarkets.com/content/stocks--equiti...   \n",
       "5  https://talkmarkets.com/symbol/sofi/portal-wid...   \n",
       "6  https://talkmarkets.com/symbol/enph/portal-wid...   \n",
       "7  https://talkmarkets.com/contributor/jimvanmeer...   \n",
       "8  https://talkmarkets.com/content/stocks--equiti...   \n",
       "9  https://talkmarkets.com/symbol/amzn/portal-wid...   \n",
       "\n",
       "                                       clicktrackUrl  \n",
       "0  https://www.google.com/url?client=internal-ele...  \n",
       "1  https://www.google.com/url?client=internal-ele...  \n",
       "2  https://www.google.com/url?client=internal-ele...  \n",
       "3  https://www.google.com/url?client=internal-ele...  \n",
       "4  https://www.google.com/url?client=internal-ele...  \n",
       "5  https://www.google.com/url?client=internal-ele...  \n",
       "6  https://www.google.com/url?client=internal-ele...  \n",
       "7  https://www.google.com/url?client=internal-ele...  \n",
       "8  https://www.google.com/url?client=internal-ele...  \n",
       "9  https://www.google.com/url?client=internal-ele...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"content\", \"url\", \"clicktrackUrl\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The Fly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.thefly_streaming import TheFly_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\ProgramData\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only support the first page now!\n"
     ]
    }
   ],
   "source": [
    "news_downloader = TheFly_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>stock</th>\n",
       "      <th>abstract</th>\n",
       "      <th>date</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Apple in talks to launch Apple Card in India, ...</td>\n",
       "      <td>AAPL</td>\n",
       "      <td>Apple is in talks to…</td>\n",
       "      <td>06/23/23</td>\n",
       "      <td>05:37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Apple says visionOS software development kit n...</td>\n",
       "      <td>AAPL</td>\n",
       "      <td>Apple announced the…</td>\n",
       "      <td>06/21/23</td>\n",
       "      <td>16:03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Apple to create spatial experiences for Apple ...</td>\n",
       "      <td>AAPL</td>\n",
       "      <td>Apple \"announced the…</td>\n",
       "      <td>06/21/23</td>\n",
       "      <td>16:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Notable open interest changes for June 21st</td>\n",
       "      <td>TSLA NVDA AAPL AMZN</td>\n",
       "      <td>Tuesday's total…</td>\n",
       "      <td>06/21/23</td>\n",
       "      <td>08:55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What You Missed This Week in Video Games</td>\n",
       "      <td>TCEHY TTWO EA CCOEY UBSFY CMCSK CMCSA RBLX AAP...</td>\n",
       "      <td>\"Game On\" is The Fly's…</td>\n",
       "      <td>06/20/23</td>\n",
       "      <td>12:11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Notable open interest changes for June 20th</td>\n",
       "      <td>TSLA AMC AAPL NVDA</td>\n",
       "      <td>Friday's total…</td>\n",
       "      <td>06/20/23</td>\n",
       "      <td>08:55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Apple call buyer realizes 20% same-day gains</td>\n",
       "      <td>AAPL</td>\n",
       "      <td>Notable profits for the…</td>\n",
       "      <td>06/16/23</td>\n",
       "      <td>08:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Notable open interest changes for June 15th</td>\n",
       "      <td>TSLA NVDA SOFI AAPL</td>\n",
       "      <td>Wednesday's total…</td>\n",
       "      <td>06/15/23</td>\n",
       "      <td>08:55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>US senators propose bill to eliminate Section ...</td>\n",
       "      <td>GOOG MSFT AMZN AAPL NVDA IBM META INTC</td>\n",
       "      <td>\"U.S. Senators Josh…</td>\n",
       "      <td>06/14/23</td>\n",
       "      <td>17:54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>#SocialStocks: Twitter skips out on rent and G...</td>\n",
       "      <td>TWTR AAPL META GOOGL GOOG ZM RBLX PINS</td>\n",
       "      <td>Welcome to…</td>\n",
       "      <td>06/14/23</td>\n",
       "      <td>15:57</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title  ...   time\n",
       "0  Apple in talks to launch Apple Card in India, ...  ...  05:37\n",
       "1  Apple says visionOS software development kit n...  ...  16:03\n",
       "2  Apple to create spatial experiences for Apple ...  ...  16:00\n",
       "3        Notable open interest changes for June 21st  ...  08:55\n",
       "4           What You Missed This Week in Video Games  ...  12:11\n",
       "5        Notable open interest changes for June 20th  ...  08:55\n",
       "6       Apple call buyer realizes 20% same-day gains  ...  08:00\n",
       "7        Notable open interest changes for June 15th  ...  08:55\n",
       "8  US senators propose bill to eliminate Section ...  ...  17:54\n",
       "9  #SocialStocks: Twitter skips out on rent and G...  ...  15:57\n",
       "\n",
       "[10 rows x 5 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"stock\", \"abstract\", \"date\", \"time\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TipRanks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.tipranks_streaming import TipRanks_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: 0 1 2 "
     ]
    }
   ],
   "source": [
    "news_downloader = TipRanks_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stocks</th>\n",
       "      <th>_id</th>\n",
       "      <th>author</th>\n",
       "      <th>category</th>\n",
       "      <th>date</th>\n",
       "      <th>description</th>\n",
       "      <th>image</th>\n",
       "      <th>isLocked</th>\n",
       "      <th>link</th>\n",
       "      <th>lockType</th>\n",
       "      <th>slug</th>\n",
       "      <th>sticky</th>\n",
       "      <th>thumbnail</th>\n",
       "      <th>title</th>\n",
       "      <th>topics</th>\n",
       "      <th>timeAgo</th>\n",
       "      <th>badge</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}]</td>\n",
       "      <td>582530</td>\n",
       "      <td>{'slug': 'steveanderson'}</td>\n",
       "      <td>{'slug': 'news', 'title': 'Market News'}</td>\n",
       "      <td>2023-06-22T19:52:41.000Z</td>\n",
       "      <td>&lt;p&gt;Those who regularly follow Apple stock (NAS...</td>\n",
       "      <td>{'src': 'https://blog.tipranks.com/wp-content/...</td>\n",
       "      <td>True</td>\n",
       "      <td>https://www.tipranks.com/news/aapl-notches-up-...</td>\n",
       "      <td>GraceCount</td>\n",
       "      <td>aapl-notches-up-following-barclays-comments</td>\n",
       "      <td>False</td>\n",
       "      <td>{'src': 'https://blog.tipranks.com/wp-content/...</td>\n",
       "      <td>AAPL Notches Up Following Barclays Comments</td>\n",
       "      <td>[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...</td>\n",
       "      <td>13h</td>\n",
       "      <td>None</td>\n",
       "      <td>582530</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>579043</td>\n",
       "      <td>{'slug': 'amit-singh'}</td>\n",
       "      <td>{'slug': 'article', 'title': 'Stock Analysis &amp;...</td>\n",
       "      <td>2023-06-19T15:30:38.000Z</td>\n",
       "      <td>&lt;p&gt;Affirm Holdings (NASDAQ:AFRM) stock recover...</td>\n",
       "      <td>{'src': 'https://blog.tipranks.com/wp-content/...</td>\n",
       "      <td>True</td>\n",
       "      <td>https://www.tipranks.com/news/article/affirm-s...</td>\n",
       "      <td>GraceCount</td>\n",
       "      <td>affirm-stock-has-risen-swiftly-will-it-beat-ap...</td>\n",
       "      <td>False</td>\n",
       "      <td>{'src': 'https://blog.tipranks.com/wp-content/...</td>\n",
       "      <td>Affirm Stock Has Risen Swiftly. Will It Beat A...</td>\n",
       "      <td>[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...</td>\n",
       "      <td>4d</td>\n",
       "      <td>None</td>\n",
       "      <td>579043</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              stocks     _id  \\\n",
       "0               [{'ticker': 'AAPL', 'market': None}]  582530   \n",
       "1  [{'ticker': 'AAPL', 'market': None}, {'ticker'...  579043   \n",
       "\n",
       "                      author  \\\n",
       "0  {'slug': 'steveanderson'}   \n",
       "1     {'slug': 'amit-singh'}   \n",
       "\n",
       "                                            category  \\\n",
       "0           {'slug': 'news', 'title': 'Market News'}   \n",
       "1  {'slug': 'article', 'title': 'Stock Analysis &...   \n",
       "\n",
       "                       date  \\\n",
       "0  2023-06-22T19:52:41.000Z   \n",
       "1  2023-06-19T15:30:38.000Z   \n",
       "\n",
       "                                         description  \\\n",
       "0  <p>Those who regularly follow Apple stock (NAS...   \n",
       "1  <p>Affirm Holdings (NASDAQ:AFRM) stock recover...   \n",
       "\n",
       "                                               image  isLocked  \\\n",
       "0  {'src': 'https://blog.tipranks.com/wp-content/...      True   \n",
       "1  {'src': 'https://blog.tipranks.com/wp-content/...      True   \n",
       "\n",
       "                                                link    lockType  \\\n",
       "0  https://www.tipranks.com/news/aapl-notches-up-...  GraceCount   \n",
       "1  https://www.tipranks.com/news/article/affirm-s...  GraceCount   \n",
       "\n",
       "                                                slug  sticky  \\\n",
       "0        aapl-notches-up-following-barclays-comments   False   \n",
       "1  affirm-stock-has-risen-swiftly-will-it-beat-ap...   False   \n",
       "\n",
       "                                           thumbnail  \\\n",
       "0  {'src': 'https://blog.tipranks.com/wp-content/...   \n",
       "1  {'src': 'https://blog.tipranks.com/wp-content/...   \n",
       "\n",
       "                                               title  \\\n",
       "0        AAPL Notches Up Following Barclays Comments   \n",
       "1  Affirm Stock Has Risen Swiftly. Will It Beat A...   \n",
       "\n",
       "                                              topics timeAgo badge      id  \n",
       "0  [{'id': 0, 'type': 'stock', 'title': 'AAPL', '...     13h  None  582530  \n",
       "1  [{'id': 0, 'type': 'stock', 'title': 'AAPL', '...      4d  None  579043  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_downloader.dataframe.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stocks</th>\n",
       "      <th>date</th>\n",
       "      <th>author</th>\n",
       "      <th>title</th>\n",
       "      <th>description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}]</td>\n",
       "      <td>2023-06-22T19:52:41.000Z</td>\n",
       "      <td>{'slug': 'steveanderson'}</td>\n",
       "      <td>AAPL Notches Up Following Barclays Comments</td>\n",
       "      <td>&lt;p&gt;Those who regularly follow Apple stock (NAS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-19T15:30:38.000Z</td>\n",
       "      <td>{'slug': 'amit-singh'}</td>\n",
       "      <td>Affirm Stock Has Risen Swiftly. Will It Beat A...</td>\n",
       "      <td>&lt;p&gt;Affirm Holdings (NASDAQ:AFRM) stock recover...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-16T20:51:11.000Z</td>\n",
       "      <td>{'slug': 'joey-frenette'}</td>\n",
       "      <td>Apple Stock (NASDAQ:AAPL): Expectations Too Mo...</td>\n",
       "      <td>&lt;p&gt;Apple (NASDAQ:AAPL) stock recently hit a ne...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-16T08:32:40.000Z</td>\n",
       "      <td>{'slug': 'sheryl-sheth'}</td>\n",
       "      <td>Lost the Nvidia and Apple Boom? Microsoft (NAS...</td>\n",
       "      <td>&lt;p&gt;Think you lost the chance to become wealthy...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-15T19:20:48.000Z</td>\n",
       "      <td>{'slug': 'joey-frenette'}</td>\n",
       "      <td>Unity Software (NASDAQ:U): Apple Vision Pro Pa...</td>\n",
       "      <td>&lt;p&gt;Unity Software (NASDAQ:U) rallied 17% when ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-15T00:53:16.000Z</td>\n",
       "      <td>{'slug': 'joey-frenette'}</td>\n",
       "      <td>Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ...</td>\n",
       "      <td>&lt;p&gt;Apple (NASDAQ:AAPL) has been on an unbeliev...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}]</td>\n",
       "      <td>2023-06-13T15:13:40.000Z</td>\n",
       "      <td>{'slug': 'vince-condarcuri'}</td>\n",
       "      <td>AAPL Stock Slips after Analyst Downgrade</td>\n",
       "      <td>&lt;p&gt;Despite outperforming the S&amp;amp;P 500 with ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}]</td>\n",
       "      <td>2023-06-10T15:09:23.000Z</td>\n",
       "      <td>{'slug': 'martyshtrubel'}</td>\n",
       "      <td>Apple Stock Gets a New Street-High Price Target</td>\n",
       "      <td>&lt;p&gt;Apple’s (NASDAQ:AAPL) virtual reality and a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-09T09:46:28.000Z</td>\n",
       "      <td>{'slug': 'amit-singh'}</td>\n",
       "      <td>NVDA to META: Insiders Capitalise on Tech Stoc...</td>\n",
       "      <td>&lt;p&gt;Technology stocks rebounded strongly in 202...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>[{'ticker': 'AAPL', 'market': None}, {'ticker'...</td>\n",
       "      <td>2023-06-08T18:20:20.000Z</td>\n",
       "      <td>{'slug': 'michaelbyrne'}</td>\n",
       "      <td>Apple Stock is on Fire. Invest in it with Thes...</td>\n",
       "      <td>&lt;p&gt;Apple (NASDAQ:AAPL) stock is off to a gain ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              stocks  \\\n",
       "0               [{'ticker': 'AAPL', 'market': None}]   \n",
       "1  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "2  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "3  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "4  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "5  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "6               [{'ticker': 'AAPL', 'market': None}]   \n",
       "7               [{'ticker': 'AAPL', 'market': None}]   \n",
       "8  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "9  [{'ticker': 'AAPL', 'market': None}, {'ticker'...   \n",
       "\n",
       "                       date                        author  \\\n",
       "0  2023-06-22T19:52:41.000Z     {'slug': 'steveanderson'}   \n",
       "1  2023-06-19T15:30:38.000Z        {'slug': 'amit-singh'}   \n",
       "2  2023-06-16T20:51:11.000Z     {'slug': 'joey-frenette'}   \n",
       "3  2023-06-16T08:32:40.000Z      {'slug': 'sheryl-sheth'}   \n",
       "4  2023-06-15T19:20:48.000Z     {'slug': 'joey-frenette'}   \n",
       "5  2023-06-15T00:53:16.000Z     {'slug': 'joey-frenette'}   \n",
       "6  2023-06-13T15:13:40.000Z  {'slug': 'vince-condarcuri'}   \n",
       "7  2023-06-10T15:09:23.000Z     {'slug': 'martyshtrubel'}   \n",
       "8  2023-06-09T09:46:28.000Z        {'slug': 'amit-singh'}   \n",
       "9  2023-06-08T18:20:20.000Z      {'slug': 'michaelbyrne'}   \n",
       "\n",
       "                                               title  \\\n",
       "0        AAPL Notches Up Following Barclays Comments   \n",
       "1  Affirm Stock Has Risen Swiftly. Will It Beat A...   \n",
       "2  Apple Stock (NASDAQ:AAPL): Expectations Too Mo...   \n",
       "3  Lost the Nvidia and Apple Boom? Microsoft (NAS...   \n",
       "4  Unity Software (NASDAQ:U): Apple Vision Pro Pa...   \n",
       "5  Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ...   \n",
       "6           AAPL Stock Slips after Analyst Downgrade   \n",
       "7    Apple Stock Gets a New Street-High Price Target   \n",
       "8  NVDA to META: Insiders Capitalise on Tech Stoc...   \n",
       "9  Apple Stock is on Fire. Invest in it with Thes...   \n",
       "\n",
       "                                         description  \n",
       "0  <p>Those who regularly follow Apple stock (NAS...  \n",
       "1  <p>Affirm Holdings (NASDAQ:AFRM) stock recover...  \n",
       "2  <p>Apple (NASDAQ:AAPL) stock recently hit a ne...  \n",
       "3  <p>Think you lost the chance to become wealthy...  \n",
       "4  <p>Unity Software (NASDAQ:U) rallied 17% when ...  \n",
       "5  <p>Apple (NASDAQ:AAPL) has been on an unbeliev...  \n",
       "6  <p>Despite outperforming the S&amp;P 500 with ...  \n",
       "7  <p>Apple’s (NASDAQ:AAPL) virtual reality and a...  \n",
       "8  <p>Technology stocks rebounded strongly in 202...  \n",
       "9  <p>Apple (NASDAQ:AAPL) stock is off to a gain ...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"stocks\", \"date\", \"author\", \"title\", \"description\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### MarketWatch (Date Range)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.marketwatch_date_range import MarketWatch_Date_Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_date = \"2022-06-01\"\n",
    "end_date = \"2022-06-30\"\n",
    "keyword = \"apple\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only support the first page now!\n"
     ]
    }
   ],
   "source": [
    "# Reuse the keyword from the configuration cell (alongside start_date / end_date)\n",
    "# so that changing it there actually changes the search performed here.\n",
    "news_downloader = MarketWatch_Date_Range()\n",
    "news_downloader.download_date_range_search(keyword = keyword, start_date = start_date, end_date = end_date)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>time</th>\n",
       "      <th>author</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Gold falls more than 2% for the month, settles...</td>\n",
       "      <td>Jun. 30, 2022 at 2:47 p.m. ET</td>\n",
       "      <td>by Joseph Adinolfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AMD stock gets an upgrade as analyst says rece...</td>\n",
       "      <td>Jun. 30, 2022 at 12:07 p.m. ET</td>\n",
       "      <td>by Emily Bary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>All 30 Dow stocks are falling, with Goldman Sa...</td>\n",
       "      <td>Jun. 30, 2022 at 9:47 a.m. ET</td>\n",
       "      <td>by Tomi Kilgore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Here’s how far oil could fall in a recession, ...</td>\n",
       "      <td>Jun. 30, 2022 at 8:34 a.m. ET</td>\n",
       "      <td>by Steve Goldstein</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Crypto Winter Is Coming After SEC Rejects Key ...</td>\n",
       "      <td>Jun. 30, 2022 at 6:37 a.m. ET</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>An FCC Commissioner Wants TikTok Yanked From A...</td>\n",
       "      <td>Jun. 30, 2022 at 3:27 a.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Meta Has a New Problem. Profit Forecasts Now L...</td>\n",
       "      <td>Jun. 29, 2022 at 1:18 p.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Fed rolls out new index to flag early warning ...</td>\n",
       "      <td>Jun. 29, 2022 at 1:04 p.m. ET</td>\n",
       "      <td>by Joy Wiltermuth</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Apple Investors Have Something New to Focus On...</td>\n",
       "      <td>Jun. 29, 2022 at 12:41 p.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Here’s why this trader is piling back into one...</td>\n",
       "      <td>Jun. 29, 2022 at 10:34 a.m. ET</td>\n",
       "      <td>by Barbara Kollmeyer</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title  \\\n",
       "0  Gold falls more than 2% for the month, settles...   \n",
       "1  AMD stock gets an upgrade as analyst says rece...   \n",
       "2  All 30 Dow stocks are falling, with Goldman Sa...   \n",
       "3  Here’s how far oil could fall in a recession, ...   \n",
       "4  Crypto Winter Is Coming After SEC Rejects Key ...   \n",
       "5  An FCC Commissioner Wants TikTok Yanked From A...   \n",
       "6  Meta Has a New Problem. Profit Forecasts Now L...   \n",
       "7  Fed rolls out new index to flag early warning ...   \n",
       "8  Apple Investors Have Something New to Focus On...   \n",
       "9  Here’s why this trader is piling back into one...   \n",
       "\n",
       "                             time                author  \n",
       "0   Jun. 30, 2022 at 2:47 p.m. ET    by Joseph Adinolfi  \n",
       "1  Jun. 30, 2022 at 12:07 p.m. ET         by Emily Bary  \n",
       "2   Jun. 30, 2022 at 9:47 a.m. ET       by Tomi Kilgore  \n",
       "3   Jun. 30, 2022 at 8:34 a.m. ET    by Steve Goldstein  \n",
       "4   Jun. 30, 2022 at 6:37 a.m. ET                        \n",
       "5   Jun. 30, 2022 at 3:27 a.m. ET           by Barron's  \n",
       "6   Jun. 29, 2022 at 1:18 p.m. ET           by Barron's  \n",
       "7   Jun. 29, 2022 at 1:04 p.m. ET     by Joy Wiltermuth  \n",
       "8  Jun. 29, 2022 at 12:41 p.m. ET           by Barron's  \n",
       "9  Jun. 29, 2022 at 10:34 a.m. ET  by Barbara Kollmeyer  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"time\", \"author\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### MarketWatch (Streaming)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.marketwatch_streaming import MarketWatch_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only support the first page now!\n"
     ]
    }
   ],
   "source": [
    "news_downloader = MarketWatch_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>time</th>\n",
       "      <th>author</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Tech IPOs Should Be Heating Up. Why They’re Not.</td>\n",
       "      <td>Jun. 23, 2023 at 2:51 a.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Everything Is Going Right for Tesla. It’s Time...</td>\n",
       "      <td>Jun. 23, 2023 at 1:30 a.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>India’s Modi cracks jokes, chows down at swank...</td>\n",
       "      <td>Jun. 22, 2023 at 11:38 p.m. ET</td>\n",
       "      <td>by Associated Press</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Work-From-Home Job Openings Are Shrinking</td>\n",
       "      <td>Jun. 22, 2023 at 6:23 p.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Congress Blasts E-Commerce Firm Temu Over Forc...</td>\n",
       "      <td>Jun. 22, 2023 at 5:44 p.m. ET</td>\n",
       "      <td>by Barron's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Meta Platforms Inc. stock outperforms market o...</td>\n",
       "      <td>Jun. 22, 2023 at 5:32 p.m. ET</td>\n",
       "      <td>by MarketWatch Automation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Microsoft Corp. stock outperforms competitors ...</td>\n",
       "      <td>Jun. 22, 2023 at 5:32 p.m. ET</td>\n",
       "      <td>by MarketWatch Automation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Netflix Inc. stock underperforms Thursday when...</td>\n",
       "      <td>Jun. 22, 2023 at 5:32 p.m. ET</td>\n",
       "      <td>by MarketWatch Automation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>GameStop Corp. Cl A stock underperforms Thursd...</td>\n",
       "      <td>Jun. 22, 2023 at 5:29 p.m. ET</td>\n",
       "      <td>by MarketWatch Automation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Amazon.com Inc. stock outperforms market on st...</td>\n",
       "      <td>Jun. 22, 2023 at 5:25 p.m. ET</td>\n",
       "      <td>by MarketWatch Automation</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title  \\\n",
       "0   Tech IPOs Should Be Heating Up. Why They’re Not.   \n",
       "1  Everything Is Going Right for Tesla. It’s Time...   \n",
       "2  India’s Modi cracks jokes, chows down at swank...   \n",
       "3          Work-From-Home Job Openings Are Shrinking   \n",
       "4  Congress Blasts E-Commerce Firm Temu Over Forc...   \n",
       "5  Meta Platforms Inc. stock outperforms market o...   \n",
       "6  Microsoft Corp. stock outperforms competitors ...   \n",
       "7  Netflix Inc. stock underperforms Thursday when...   \n",
       "8  GameStop Corp. Cl A stock underperforms Thursd...   \n",
       "9  Amazon.com Inc. stock outperforms market on st...   \n",
       "\n",
       "                             time                     author  \n",
       "0   Jun. 23, 2023 at 2:51 a.m. ET                by Barron's  \n",
       "1   Jun. 23, 2023 at 1:30 a.m. ET                by Barron's  \n",
       "2  Jun. 22, 2023 at 11:38 p.m. ET        by Associated Press  \n",
       "3   Jun. 22, 2023 at 6:23 p.m. ET                by Barron's  \n",
       "4   Jun. 22, 2023 at 5:44 p.m. ET                by Barron's  \n",
       "5   Jun. 22, 2023 at 5:32 p.m. ET  by MarketWatch Automation  \n",
       "6   Jun. 22, 2023 at 5:32 p.m. ET  by MarketWatch Automation  \n",
       "7   Jun. 22, 2023 at 5:32 p.m. ET  by MarketWatch Automation  \n",
       "8   Jun. 22, 2023 at 5:29 p.m. ET  by MarketWatch Automation  \n",
       "9   Jun. 22, 2023 at 5:25 p.m. ET  by MarketWatch Automation  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"time\", \"author\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Penny Stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requesting https://pennystocks.com ... succeed!\n",
      "Gathering again .. Remaining Retry: 4\n",
      "Only support the first page now!\n"
     ]
    }
   ],
   "source": [
    "news_downloader = PennyStocks_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>time</th>\n",
       "      <th>brief</th>\n",
       "      <th>reading_time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Best Penny Stocks to Buy Ahead Of Apple’s Even...</td>\n",
       "      <td>September 14, 2021</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in...</td>\n",
       "      <td>5 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What Could The Apple Event Mean For Penny Stoc...</td>\n",
       "      <td>October 13, 2020</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M...</td>\n",
       "      <td>5 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3 Red Hot Penny Stocks To Watch Before Next We...</td>\n",
       "      <td>June 14, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n</td>\n",
       "      <td>4 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Penny Stocks Definition &amp; 7 Trading Strategies...</td>\n",
       "      <td>June 8, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?...</td>\n",
       "      <td>6 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Best Penny Stocks To Buy? 5 With Big News This...</td>\n",
       "      <td>June 7, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ...</td>\n",
       "      <td>4 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Penny Stocks &amp; The Stock Market Today: Top Tre...</td>\n",
       "      <td>May 30, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s...</td>\n",
       "      <td>6 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Penny Stocks To Buy? 3 AI Stocks To Watch Righ...</td>\n",
       "      <td>May 30, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat...</td>\n",
       "      <td>5 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What Are Penny Stocks &amp; Should You Buy Them In...</td>\n",
       "      <td>May 19, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ...</td>\n",
       "      <td>6 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Trading Penny Stocks: 3 High-Growth Industries...</td>\n",
       "      <td>May 8, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu...</td>\n",
       "      <td>7 minute read</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fed Meeting Live Updates: 10 Takeaways From Ma...</td>\n",
       "      <td>May 3, 2023</td>\n",
       "      <td>\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma...</td>\n",
       "      <td>10 minute read</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title                time  \\\n",
       "0  Best Penny Stocks to Buy Ahead Of Apple’s Even...  September 14, 2021   \n",
       "0  What Could The Apple Event Mean For Penny Stoc...    October 13, 2020   \n",
       "0  3 Red Hot Penny Stocks To Watch Before Next We...       June 14, 2023   \n",
       "0  Penny Stocks Definition & 7 Trading Strategies...        June 8, 2023   \n",
       "0  Best Penny Stocks To Buy? 5 With Big News This...        June 7, 2023   \n",
       "0  Penny Stocks & The Stock Market Today: Top Tre...        May 30, 2023   \n",
       "0  Penny Stocks To Buy? 3 AI Stocks To Watch Righ...        May 30, 2023   \n",
       "0  What Are Penny Stocks & Should You Buy Them In...        May 19, 2023   \n",
       "0  Trading Penny Stocks: 3 High-Growth Industries...         May 8, 2023   \n",
       "0  Fed Meeting Live Updates: 10 Takeaways From Ma...         May 3, 2023   \n",
       "\n",
       "                                               brief    reading_time  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in...   5 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M...   5 minute read  \n",
       "0   \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n   4 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?...   6 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ...   4 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s...   6 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat...   5 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ...   6 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu...   7 minute read  \n",
       "0  \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma...  10 minute read  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"time\", \"brief\", \"reading_time\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Seeking Alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_date = \"2023-06-01\"\n",
    "end_date = \"2023-06-30\"\n",
    "stock = \"AAPL\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading Titles: 100%|██████████| 1/1 [00:00<?, ?it/s]\n"
     ]
    }
   ],
   "source": [
    "news_downloader = SeekingAlpha_Date_Range()\n",
    "news_downloader.download_date_range_stock(start_date, end_date, stock)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>publishOn</th>\n",
       "      <th>title</th>\n",
       "      <th>commentCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-06-19T09:00:00-04:00</td>\n",
       "      <td>Artificial intelligence is a '1995 moment' for...</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-06-16T11:59:46-04:00</td>\n",
       "      <td>Citi: Don't worry about Big Tech fueling 2023'...</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-06-15T07:20:12-04:00</td>\n",
       "      <td>Google said to temper chatbot use for employee...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-06-14T05:17:31-04:00</td>\n",
       "      <td>Nvidia crosses $1T market cap powered by the r...</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-06-13T17:17:00-04:00</td>\n",
       "      <td>Intel in talks to be anchor investor in chip d...</td>\n",
       "      <td>94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2023-06-13T12:14:45-04:00</td>\n",
       "      <td>Hot Stocks: AAPL falls on downgrade; MANU rise...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2023-06-13T06:22:59-04:00</td>\n",
       "      <td>Apple notches record close as bulls continue t...</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2023-06-13T04:49:26-04:00</td>\n",
       "      <td>Apple cut to Neutral at UBS on softer iPhone a...</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2023-06-09T05:42:17-04:00</td>\n",
       "      <td>AI looking like a 'winner-take-more' game - Go...</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2023-06-09T05:21:04-04:00</td>\n",
       "      <td>Zuckerberg's vision for AR/VR headsets differe...</td>\n",
       "      <td>92</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   publishOn  \\\n",
       "0  2023-06-19T09:00:00-04:00   \n",
       "1  2023-06-16T11:59:46-04:00   \n",
       "2  2023-06-15T07:20:12-04:00   \n",
       "3  2023-06-14T05:17:31-04:00   \n",
       "4  2023-06-13T17:17:00-04:00   \n",
       "5  2023-06-13T12:14:45-04:00   \n",
       "6  2023-06-13T06:22:59-04:00   \n",
       "7  2023-06-13T04:49:26-04:00   \n",
       "8  2023-06-09T05:42:17-04:00   \n",
       "9  2023-06-09T05:21:04-04:00   \n",
       "\n",
       "                                               title  commentCount  \n",
       "0  Artificial intelligence is a '1995 moment' for...            63  \n",
       "1  Citi: Don't worry about Big Tech fueling 2023'...            17  \n",
       "2  Google said to temper chatbot use for employee...             8  \n",
       "3  Nvidia crosses $1T market cap powered by the r...            36  \n",
       "4  Intel in talks to be anchor investor in chip d...            94  \n",
       "5  Hot Stocks: AAPL falls on downgrade; MANU rise...             3  \n",
       "6  Apple notches record close as bulls continue t...            18  \n",
       "7  Apple cut to Neutral at UBS on softer iPhone a...            26  \n",
       "8  AI looking like a 'winner-take-more' game - Go...            39  \n",
       "9  Zuckerberg's vision for AR/VR headsets differe...            92  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"publishOn\",\"title\",\"commentCount\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reuters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Geting pages: 1 2 3 "
     ]
    }
   ],
   "source": [
    "news_downloader = Reuters_Streaming()\n",
    "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>published_time</th>\n",
       "      <th>title</th>\n",
       "      <th>description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-06-19T10:17:24.474Z</td>\n",
       "      <td>Hong Kong tycoon Jimmy Lai loses appeal agains...</td>\n",
       "      <td>A Hong Kong appeal court on Monday blocked jai...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-06-19T09:49:09.391Z</td>\n",
       "      <td>Podcast: Blinken meets Xi and Chinese bankers ...</td>\n",
       "      <td>U.S. Secretary of State Anthony Blinken is in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-06-19T02:10:24.581Z</td>\n",
       "      <td>Their parents made China the world's factory. ...</td>\n",
       "      <td>When Steven Du took over his parents' factory ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-06-17T08:14:15.708Z</td>\n",
       "      <td>Japan to open up Apple- and Google-dominated p...</td>\n",
       "      <td>Japan plans to stoke competition in smartphone...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-06-16T13:28:02.538Z</td>\n",
       "      <td>Khashoggi's widow sues Israeli spyware company...</td>\n",
       "      <td>The widow of murdered Saudi journalist Jamal K...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2023-06-16T11:12:02.899Z</td>\n",
       "      <td>Factbox: DLE companies racing to reshape globa...</td>\n",
       "      <td>Lithium, the metal used to make electric vehic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2023-06-16T10:17:23.831Z</td>\n",
       "      <td>Podcast: US-Iran talks and Australia divided o...</td>\n",
       "      <td>Australia is divided on a historic referendum ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2023-06-16T02:16:44.523Z</td>\n",
       "      <td>Chinese e-commerce giants entice cautious cons...</td>\n",
       "      <td>China's e-commerce platforms are competing fie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2023-06-15T10:21:02.697Z</td>\n",
       "      <td>S&amp;P 500 leaps to highest close in 14 months; t...</td>\n",
       "      <td>The S&amp;P 500 and Nasdaq surged on Thursday to c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2023-06-15T19:49:27.459Z</td>\n",
       "      <td>Microsoft notches record high valuation of nea...</td>\n",
       "      <td>Microsoft Corp shares rose to a new record hig...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             published_time  \\\n",
       "0  2023-06-19T10:17:24.474Z   \n",
       "1  2023-06-19T09:49:09.391Z   \n",
       "2  2023-06-19T02:10:24.581Z   \n",
       "3  2023-06-17T08:14:15.708Z   \n",
       "4  2023-06-16T13:28:02.538Z   \n",
       "5  2023-06-16T11:12:02.899Z   \n",
       "6  2023-06-16T10:17:23.831Z   \n",
       "7  2023-06-16T02:16:44.523Z   \n",
       "8  2023-06-15T10:21:02.697Z   \n",
       "9  2023-06-15T19:49:27.459Z   \n",
       "\n",
       "                                               title  \\\n",
       "0  Hong Kong tycoon Jimmy Lai loses appeal agains...   \n",
       "1  Podcast: Blinken meets Xi and Chinese bankers ...   \n",
       "2  Their parents made China the world's factory. ...   \n",
       "3  Japan to open up Apple- and Google-dominated p...   \n",
       "4  Khashoggi's widow sues Israeli spyware company...   \n",
       "5  Factbox: DLE companies racing to reshape globa...   \n",
       "6  Podcast: US-Iran talks and Australia divided o...   \n",
       "7  Chinese e-commerce giants entice cautious cons...   \n",
       "8  S&P 500 leaps to highest close in 14 months; t...   \n",
       "9  Microsoft notches record high valuation of nea...   \n",
       "\n",
       "                                         description  \n",
       "0  A Hong Kong appeal court on Monday blocked jai...  \n",
       "1  U.S. Secretary of State Anthony Blinken is in ...  \n",
       "2  When Steven Du took over his parents' factory ...  \n",
       "3  Japan plans to stoke competition in smartphone...  \n",
       "4  The widow of murdered Saudi journalist Jamal K...  \n",
       "5  Lithium, the metal used to make electric vehic...  \n",
       "6  Australia is divided on a historic referendum ...  \n",
       "7  China's e-commerce platforms are competing fie...  \n",
       "8  The S&P 500 and Nasdaq surged on Thursday to c...  \n",
       "9  Microsoft Corp shares rose to a new record hig...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"published_time\",\"title\",\"description\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sina Finance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_date = \"2016-01-01\"\n",
    "end_date = \"2016-01-01\"\n",
    "config = {\n",
    "    \"use_proxy\": \"china_free\",\n",
    "    \"max_retry\": 5,\n",
    "    \"proxy_pages\": 5,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]\n",
      "Checking ips: 100%|██████████| 75/75 [00:20<00:00,  3.67it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "获取到的代理ip数量: 75 。Get proxy ips: 75.\n",
      "能用的代理数量： 75。Usable proxy ips: 75.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading Titles...: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]\n",
      "Gathering news contents: 100%|██████████| 103/103 [00:22<00:00,  4.50it/s]\n"
     ]
    }
   ],
   "source": [
    "news_downloader = Sina_Finance_Date_Range(config)\n",
    "news_downloader.download_date_range_all(start_date,end_date)\n",
    "news_downloader.gather_content()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>分析师：伊朗重回国际原油市场无法阻止</td>\n",
       "      <td>新浪美股讯 北京时间1月1日晚CNBC称，加拿大皇家银行（RBC）分析师Helima Cro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>FAA：波音767的逃生扶梯存在缺陷</td>\n",
       "      <td>新浪美股讯 北京时间1日晚，美国联邦航空局（FAA）要求航空公司对波音767机型的救生扶梯进...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>非制造业新订单指数创新高 需求回升力度明显</td>\n",
       "      <td>中新社北京1月1日电 （记者 刘长忠）记者1日从中国物流与采购联合会获悉，在最新发布的201...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>雷曼兄弟针对大和证券提起索赔诉讼</td>\n",
       "      <td>新浪美股讯 北京时间1日下午共同社称，2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>国内钢铁PMI有所回升 钢市低迷形势有所改善</td>\n",
       "      <td>新华社上海1月1日专电（记者李荣）据中物联钢铁物流专业委员会1日发布的指数报告，2015年1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>马息岭凸显朝鲜旅游体育战略</td>\n",
       "      <td>新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员，他们本月就...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>五洲船舶破产清算 近十年来首现国有船厂倒闭</td>\n",
       "      <td>（原标题：中国首家国有船厂破产倒闭）\\n低迷的中国造船市场，多年来首次出现国有船厂破产清算的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>过半城市房价环比上涨 百城住宅均价加速升温</td>\n",
       "      <td>资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>经济学人：巴西病根到底在哪里</td>\n",
       "      <td>新浪美股北京时间1日讯 原本，巴西人是该高高兴兴迎接2016年的。8月间，里约热内卢将举办南...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>中国首家国有船厂破产倒闭:五洲船舶目前已停工</td>\n",
       "      <td>低迷的中国造船市场，多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    title                                            content\n",
       "0      分析师：伊朗重回国际原油市场无法阻止  新浪美股讯 北京时间1月1日晚CNBC称，加拿大皇家银行（RBC）分析师Helima Cro...\n",
       "1      FAA：波音767的逃生扶梯存在缺陷  新浪美股讯 北京时间1日晚，美国联邦航空局（FAA）要求航空公司对波音767机型的救生扶梯进...\n",
       "2   非制造业新订单指数创新高 需求回升力度明显  中新社北京1月1日电 （记者 刘长忠）记者1日从中国物流与采购联合会获悉，在最新发布的201...\n",
       "3        雷曼兄弟针对大和证券提起索赔诉讼  新浪美股讯 北京时间1日下午共同社称，2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...\n",
       "4  国内钢铁PMI有所回升 钢市低迷形势有所改善  新华社上海1月1日专电（记者李荣）据中物联钢铁物流专业委员会1日发布的指数报告，2015年1...\n",
       "5           马息岭凸显朝鲜旅游体育战略  新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员，他们本月就...\n",
       "6   五洲船舶破产清算 近十年来首现国有船厂倒闭  （原标题：中国首家国有船厂破产倒闭）\\n低迷的中国造船市场，多年来首次出现国有船厂破产清算的...\n",
       "7   过半城市房价环比上涨 百城住宅均价加速升温  资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...\n",
       "8          经济学人：巴西病根到底在哪里  新浪美股北京时间1日讯 原本，巴西人是该高高兴兴迎接2016年的。8月间，里约热内卢将举办南...\n",
       "9  中国首家国有船厂破产倒闭:五洲船舶目前已停工  低迷的中国造船市场，多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"title\", \"content\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eastmoney"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "pages = 3\n",
    "stock = \"600519\"\n",
    "config = {\n",
    "    \"use_proxy\": \"china_free\",\n",
    "    \"max_retry\": 5,\n",
    "    \"proxy_pages\": 5,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]\n",
      "Checking ips: 100%|██████████| 75/75 [00:20<00:00,  3.62it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "获取到的代理ip数量: 75 。Get proxy ips: 75.\n",
      "能用的代理数量： 75。Usable proxy ips: 75.\n",
      "Geting pages: 0 1 2 Get total 3 pages.\n"
     ]
    }
   ],
   "source": [
    "news_downloader = Eastmoney_Streaming(config)\n",
    "news_downloader.download_streaming_stock(stock,pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read amount</th>\n",
       "      <th>comments</th>\n",
       "      <th>title</th>\n",
       "      <th>content link</th>\n",
       "      <th>author</th>\n",
       "      <th>create time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1407</td>\n",
       "      <td>6</td>\n",
       "      <td>茅台2022年报的12个小秘密</td>\n",
       "      <td>/news,600519,1295554981.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-09 19:40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>234</td>\n",
       "      <td>0</td>\n",
       "      <td>东北证券维持贵州茅台买入评级 预计2023年净利润同比</td>\n",
       "      <td>/news,600519,1295512910.html</td>\n",
       "      <td>公司研报提示</td>\n",
       "      <td>04-09 11:24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>385</td>\n",
       "      <td>0</td>\n",
       "      <td>贵州茅台：融资余额169.34亿元，创近一年新低（04-07</td>\n",
       "      <td>/news,600519,1295407809.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-08 07:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>233</td>\n",
       "      <td>0</td>\n",
       "      <td>贵州茅台：融资净买入1248.48万元，融资余额169.79亿</td>\n",
       "      <td>/news,600519,1294929438.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-07 07:28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2804</td>\n",
       "      <td>16</td>\n",
       "      <td>贵州茅台公益基金会正式成立</td>\n",
       "      <td>/news,600519,1294612056.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-06 12:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>333</td>\n",
       "      <td>0</td>\n",
       "      <td>贵州茅台04月04日获沪股通增持19.55万股</td>\n",
       "      <td>/news,600519,1294268016.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-05 07:48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>312</td>\n",
       "      <td>0</td>\n",
       "      <td>贵州茅台：融资余额169.66亿元，创近一年新低（04-04</td>\n",
       "      <td>/news,600519,1294265710.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-05 07:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>22721</td>\n",
       "      <td>16</td>\n",
       "      <td>4月4日北向资金最新动向（附十大成交股）</td>\n",
       "      <td>/news,600519,1294192188.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-04 18:48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>654</td>\n",
       "      <td>1</td>\n",
       "      <td>大宗交易：贵州茅台成交235.9万元，成交价1814.59元（</td>\n",
       "      <td>/news,600519,1294173281.html</td>\n",
       "      <td>贵州茅台资讯</td>\n",
       "      <td>04-04 17:21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>233</td>\n",
       "      <td>0</td>\n",
       "      <td>第一上海证券维持贵州茅台买入评级 目标价2428.8元</td>\n",
       "      <td>/news,600519,1293784734.html</td>\n",
       "      <td>公司研报提示</td>\n",
       "      <td>04-04 09:30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  read amount comments                            title  \\\n",
       "0        1407        6                  茅台2022年报的12个小秘密   \n",
       "1         234        0      东北证券维持贵州茅台买入评级 预计2023年净利润同比   \n",
       "2         385        0   贵州茅台：融资余额169.34亿元，创近一年新低（04-07   \n",
       "3         233        0  贵州茅台：融资净买入1248.48万元，融资余额169.79亿   \n",
       "4        2804       16                    贵州茅台公益基金会正式成立   \n",
       "5         333        0          贵州茅台04月04日获沪股通增持19.55万股   \n",
       "6         312        0   贵州茅台：融资余额169.66亿元，创近一年新低（04-04   \n",
       "7       22721       16             4月4日北向资金最新动向（附十大成交股）   \n",
       "8         654        1  大宗交易：贵州茅台成交235.9万元，成交价1814.59元（   \n",
       "9         233        0      第一上海证券维持贵州茅台买入评级 目标价2428.8元   \n",
       "\n",
       "                   content link  author  create time  \n",
       "0  /news,600519,1295554981.html  贵州茅台资讯  04-09 19:40  \n",
       "1  /news,600519,1295512910.html  公司研报提示  04-09 11:24  \n",
       "2  /news,600519,1295407809.html  贵州茅台资讯  04-08 07:30  \n",
       "3  /news,600519,1294929438.html  贵州茅台资讯  04-07 07:28  \n",
       "4  /news,600519,1294612056.html  贵州茅台资讯  04-06 12:29  \n",
       "5  /news,600519,1294268016.html  贵州茅台资讯  04-05 07:48  \n",
       "6  /news,600519,1294265710.html  贵州茅台资讯  04-05 07:30  \n",
       "7  /news,600519,1294192188.html  贵州茅台资讯  04-04 18:48  \n",
       "8  /news,600519,1294173281.html  贵州茅台资讯  04-04 17:21  \n",
       "9  /news,600519,1293784734.html  公司研报提示  04-04 09:30  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 10 rows, restricted to the title and creation-time columns.\n",
    "# Column selection is applied to `.dataframe` (the attribute the other sections\n",
    "# index), not to the downloader object itself.\n",
    "selected_columns = [\"title\", \"create time\"]\n",
    "news_downloader.dataframe[selected_columns].head(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finnhub / Yahoo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_date = \"2023-01-01\"\n",
    "end_date = \"2023-01-03\"\n",
    "config = {\n",
    "    \"use_proxy\": \"us_free\",\n",
    "    \"max_retry\": 5,\n",
    "    \"proxy_pages\": 5,\n",
    "    \"token\": \"YOUR_FINNHUB_TOKEN\"  # Available at https://finnhub.io/dashboard\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Checking ips: 100%|██████████| 75/75 [02:51<00:00,  2.28s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get proxy ips: 75.\n",
      "Usable proxy ips: 75.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading Titles: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it]\n",
      "Gathering news contents:  48%|████▊     | 49/102 [03:18<02:51,  3.24s/it]c:\\Users\\Olive\\.conda\\envs\\finrl\\lib\\site-packages\\urllib3\\connectionpool.py:1052: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
      "  InsecureRequestWarning,\n",
      "Gathering news contents: 100%|██████████| 102/102 [06:15<00:00,  3.68s/it]\n"
     ]
    }
   ],
   "source": [
    "news_downloader = Finnhub_Date_Range(config)\n",
    "news_downloader.download_date_range_stock(start_date,end_date)\n",
    "news_downloader.gather_content()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>datetime</th>\n",
       "      <th>headline</th>\n",
       "      <th>id</th>\n",
       "      <th>image</th>\n",
       "      <th>related</th>\n",
       "      <th>source</th>\n",
       "      <th>summary</th>\n",
       "      <th>url</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>company</td>\n",
       "      <td>2023-01-03 23:40:08</td>\n",
       "      <td>My 26-Stock $349k Portfolio Gets A Nice Petrob...</td>\n",
       "      <td>118107004</td>\n",
       "      <td>https://media.gettyimages.com/id/1441204186/ph...</td>\n",
       "      <td>AAPL</td>\n",
       "      <td>SeekingAlpha</td>\n",
       "      <td>My portfolio, built specifically for my retire...</td>\n",
       "      <td>https://finnhub.io/api/news?id=d3c15f6f365663b...</td>\n",
       "      <td>Home\\nInvesting Strategy\\nPortfolio Strategy\\n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>company</td>\n",
       "      <td>2023-01-03 22:09:00</td>\n",
       "      <td>Apple’s Market Cap Slides Below $2 Trillion fo...</td>\n",
       "      <td>118105849</td>\n",
       "      <td></td>\n",
       "      <td>AAPL</td>\n",
       "      <td>Yahoo</td>\n",
       "      <td>The tech giant is one of only five U.S. compan...</td>\n",
       "      <td>https://finnhub.io/api/news?id=42343678a7474e1...</td>\n",
       "      <td>Error</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  category            datetime  \\\n",
       "0  company 2023-01-03 23:40:08   \n",
       "1  company 2023-01-03 22:09:00   \n",
       "\n",
       "                                            headline         id  \\\n",
       "0  My 26-Stock $349k Portfolio Gets A Nice Petrob...  118107004   \n",
       "1  Apple’s Market Cap Slides Below $2 Trillion fo...  118105849   \n",
       "\n",
       "                                               image related        source  \\\n",
       "0  https://media.gettyimages.com/id/1441204186/ph...    AAPL  SeekingAlpha   \n",
       "1                                                       AAPL         Yahoo   \n",
       "\n",
       "                                             summary  \\\n",
       "0  My portfolio, built specifically for my retire...   \n",
       "1  The tech giant is one of only five U.S. compan...   \n",
       "\n",
       "                                                 url  \\\n",
       "0  https://finnhub.io/api/news?id=d3c15f6f365663b...   \n",
       "1  https://finnhub.io/api/news?id=42343678a7474e1...   \n",
       "\n",
       "                                             content  \n",
       "0  Home\\nInvesting Strategy\\nPortfolio Strategy\\n...  \n",
       "1                                              Error  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = news_downloader.dataframe\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>headline</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>My 26-Stock $349k Portfolio Gets A Nice Petrob...</td>\n",
       "      <td>Home\\nInvesting Strategy\\nPortfolio Strategy\\n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Apple’s Market Cap Slides Below $2 Trillion fo...</td>\n",
       "      <td>Error</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>US STOCKS-Wall St starts the year with a dip; ...</td>\n",
       "      <td>(For a Reuters live blog on U.S., UK and Europ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Buy 4 January Dogs Of The Dow, Watch 4 More</td>\n",
       "      <td>Home\\nDividends\\nDividend Quick Picks\\nBuy 4 J...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Apple's stock market value falls below $2 tril...</td>\n",
       "      <td>Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>CORRECTED-UPDATE 1-Apple's stock market value ...</td>\n",
       "      <td>Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Apple Stock Falls Amid Report Of Product Order...</td>\n",
       "      <td>Apple stock got off to a slow start in 2023 as...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>US STOCKS-Wall St starts the year with a dip; ...</td>\n",
       "      <td>Summary\\nCompanies\\nTesla shares plunge on Q4 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>More than $1 trillion wiped off value of Apple...</td>\n",
       "      <td>apple store\\nMore than $1 trillion has been wi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>McLean's Iridium inks agreement to put its sat...</td>\n",
       "      <td>The company hasn't named its partner, but it's...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            headline  \\\n",
       "0  My 26-Stock $349k Portfolio Gets A Nice Petrob...   \n",
       "1  Apple’s Market Cap Slides Below $2 Trillion fo...   \n",
       "2  US STOCKS-Wall St starts the year with a dip; ...   \n",
       "3        Buy 4 January Dogs Of The Dow, Watch 4 More   \n",
       "4  Apple's stock market value falls below $2 tril...   \n",
       "5  CORRECTED-UPDATE 1-Apple's stock market value ...   \n",
       "6  Apple Stock Falls Amid Report Of Product Order...   \n",
       "7  US STOCKS-Wall St starts the year with a dip; ...   \n",
       "8  More than $1 trillion wiped off value of Apple...   \n",
       "9  McLean's Iridium inks agreement to put its sat...   \n",
       "\n",
       "                                             content  \n",
       "0  Home\\nInvesting Strategy\\nPortfolio Strategy\\n...  \n",
       "1                                              Error  \n",
       "2  (For a Reuters live blog on U.S., UK and Europ...  \n",
       "3  Home\\nDividends\\nDividend Quick Picks\\nBuy 4 J...  \n",
       "4  Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...  \n",
       "5  Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...  \n",
       "6  Apple stock got off to a slow start in 2023 as...  \n",
       "7  Summary\\nCompanies\\nTesla shares plunge on Q4 ...  \n",
       "8  apple store\\nMore than $1 trillion has been wi...  \n",
       "9  The company hasn't named its partner, but it's...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_columns = [\"headline\", \"content\"]\n",
    "df[selected_columns].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "finrl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
